library(readxl)
library(fixest)
library(car)
library(lme4)
library(ggplot2)
library(dplyr)
library(ggeffects)
library(scales)
library(tidyverse)
library(boot)
library(sandwich)
library(lmtest)
library(olsrr)
library(splines)
library(gridExtra)
options(scipen=999)

# Load data
par_2018 <- read_xlsx("MALAYSIA_2018_PARLIAMENT_RESULTS.xlsx")

# East Malaysia Excluded
par_2018_subset <- subset(par_2018, 
                          !(STATE %in% c("SABAH", "SARAWAK", 
                                         "WILAYAH PERSEKUTUAN LABUAN")))

par_2018_subset$STATE <- tools::toTitleCase(tolower(par_2018_subset$STATE))


# Figure 3 Replication
par_2018_plot <- subset(par_2018_subset, !(STATE %in% "Wilayah Persekutuan Putrajaya"))

ggplot(par_2018_subset, aes(x = Population.Density, fill = STATE)) +
  geom_histogram(position = "identity", alpha = 0.6, bins = 30) +
  labs(#title = "Distribution of Population Density by State"
    x = expression("Population Density (1,000/km"^2*")"), y = "Number Count") +
  theme_minimal()

ggplot(par_2018_plot,
       aes(x = reorder(STATE, Population.Density, median),
           y = Population.Density, fill = STATE)) +
  geom_violin(alpha = 0.6, color = NA) +
  geom_boxplot(width = 0.12, outlier.size = 0.5, alpha = 0.9) +
  coord_flip() +
  labs(x = NULL, y = "Population Density") +
  theme_minimal() +
  guides(fill = "none")

# Base Model
base_model <- lm(BN.Share~Malay.Share + Population.Density, 
                 data = par_2018_subset)
base_model_plot <- ols_plot_cooksd_bar(lm(BN.Share~Malay.Share + Population.Density, 
                 data = par_2018_subset), print_plot = TRUE)

cooks_d <- cooks.distance(base_model)

influential_points <- which(cooks_d > 0.024)

print(influential_points)


# Compute HC2 robust standard errors (Table 4)
robust_se_HC2 <- sqrt(diag(vcovHC(base_model, type = "HC2")))

coeftest(base_model, vcov. = vcovHC(base_model, type = "HC2"))

par_2018_subset %>% 
  mutate(z_score = (Population.Density - mean(Population.Density)/mean(Population.Density)))
         
# Fit random slope model
re_model2 <- lmer(
  BN.Share ~ Malay.Share + Population.Density + z_score + 
    (Population.Density | STATE),
  data = par_2018_subset
)

# Original fixed + random effects slopes
re_df   <- ranef(re_model2, condVar = TRUE)$STATE
fixed_slope  <- fixef(re_model2)["Population.Density"]
state_slope  <- fixed_slope + re_df[, "Population.Density"]
states <- rownames(re_df)

# Bootstrap
n_boot <- 100000
set.seed(123)

# Function to extract state-level slope estimates
boot_fun <- function(data, indices) {
  d <- data[indices, ]
  
  m <- suppressWarnings(
    tryCatch(
      lmer(BN.Share ~ Malay.Share + Population.Density + (1 + Population.Density | STATE), data = d),
      error = function(e) NULL
    )
  )
  
  if (!is.null(m) && inherits(m, "merMod") && !isSingular(m)) {
    re <- ranef(m)$STATE
    fs <- fixef(m)["Population.Density"]
    return(fs + re[states, "Population.Density"])
  } else {
    return(rep(NA, length(states)))
  }
}

# Run bootstrap
boot_results <- suppressMessages(
  suppressWarnings(
    boot(data = par_2018_subset, statistic = boot_fun, R = n_boot)
  )
)

# Extract SEs (by column = state)
boot_se <- apply(boot_results$t, 2, sd, na.rm = TRUE)

# Build final dataframe (Table 5)
pd_df_boot <- data.frame(
  STATE    = states,
  estimate = state_slope,
  SE       = boot_se,
  lower.CI = state_slope - 1.96 * boot_se,
  upper.CI = state_slope + 1.96 * boot_se
)


# Figure 5
ggplot(pd_df_boot, aes(x = reorder(STATE, estimate), y = estimate)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey70") +
  geom_errorbar(
    aes(ymin = lower.CI, ymax = upper.CI),
    width = 0.3,
    color = "#2A4B7C"
  ) +
  geom_point(color = "#2A4B7C", size = 2) +
  coord_flip() +
  scale_y_continuous(
    labels = label_number(accuracy = 0.001),
    breaks = pretty_breaks(n = 5)
  ) +
  labs(
    x = NULL,
    y = "Slope (ΔBN.Share per unit of Population Density)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    axis.text.y        = element_text(size = 10, family = "Times New Roman"),
    axis.text.x        = element_text(size = 10, family = "Times New Roman"),
    axis.title.x       = element_text(size = 10, family = "Times New Roman"),
    plot.margin        = margin(15, 15, 15, 15)
  )

# Appendix A
par_2018_subset_2 <- par_2018_subset %>% 
  mutate(
    z_score = (Population.Density - mean(Population.Density)) / sd(Population.Density)
  )

# Fit random slope model
re_model3 <- lmer(
  BN.Share ~ Malay.Share + Population.Density + z_score + 
    (Population.Density | STATE),
  data = par_2018_subset_2
)

# Original fixed + random effects slopes
re_df_3        <- ranef(re_model3, condVar = TRUE)$STATE
fixed_slope_3  <- fixef(re_model3)["Population.Density"]
state_slope_3  <- fixed_slope_3 + re_df_3[, "Population.Density"]
states         <- rownames(re_df_3)

# Bootstrap
n_boot <- 100000
set.seed(123)

# Function to extract state-level slope estimates
boot_fun <- function(data, indices) {
  d <- data[indices, ]
  
  m <- suppressWarnings(
    tryCatch(
      lmer(BN.Share ~ Malay.Share + Population.Density + z_score + (Population.Density | STATE), data = d),
      error = function(e) NULL
    )
  )
  
  if (!is.null(m) && inherits(m, "merMod") && !isSingular(m)) {
    re <- ranef(m)$STATE
    fs <- fixef(m)["Population.Density"]
    return(fs + re[states, "Population.Density"])
  } else {
    return(rep(NA, length(states)))
  }
}

# Run bootstrap
boot_results_3 <- suppressMessages(
  suppressWarnings(
    boot(data = par_2018_subset_2, statistic = boot_fun, R = n_boot)
  )
)

# Extract SEs (by column = state)
boot_se_3 <- apply(boot_results_3$t, 2, sd, na.rm = TRUE)

# Build final dataframe (Table Appendix A)
pd_df_boot_3 <- data.frame(
  STATE    = states,
  estimate = state_slope_3,
  SE       = boot_se_3,
  lower.CI = state_slope_3 - 1.96 * boot_se_3,
  upper.CI = state_slope_3 + 1.96 * boot_se_3
)

# Figure 5
ggplot(pd_df_boot_3, aes(x = reorder(STATE, estimate), y = estimate)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey70") +
  geom_errorbar(aes(ymin = lower.CI, ymax = upper.CI), width = 0.3, color = "#2A4B7C") +
  geom_point(color = "#2A4B7C", size = 2) +
  coord_flip() +
  scale_y_continuous(labels = label_number(accuracy = 0.001), breaks = pretty_breaks(n = 5)) +
  labs(x = NULL, y = "Slope (ΔBN.Share per unit of Population Density)") +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    axis.text.y        = element_text(size = 10, family = "Times New Roman"),
    axis.text.x        = element_text(size = 10, family = "Times New Roman"),
    axis.title.x       = element_text(size = 10, family = "Times New Roman"),
    plot.margin        = margin(15, 15, 15, 15)
  )
